import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
# Load both splits and keep only the 'text' column, discarding rows where
# that column is missing.
train_texts = (
    pd.read_csv('./fake_news/train.csv')
    .dropna(subset='text')['text']
)
test_texts = (
    pd.read_csv('./fake_news/test.csv')
    .dropna(subset='text')['text']
)
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance reused for every token.
stemmer = PorterStemmer()
# Union of the stop-word lists of every language available in the NLTK
# stopwords corpus. A set comprehension builds the set in one linear pass,
# avoiding the repeated list copying of sum(list_of_lists, []).
stop_words = {
    word
    for language in stopwords.fileids()
    for word in stopwords.words(language)
}
def clean(text):
    """Normalise a raw document into a space-separated string of stems.

    The text is tokenised with ``word_tokenize``; tokens that are not purely
    alphabetic or that are stop words are discarded, and the remaining
    tokens are lowercased and Porter-stemmed.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        no token survives).
    """
    kept = []
    for token in word_tokenize(text):
        # Only keep tokens made up entirely of letters.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Compare the lowercased form: the stop-word lists are (almost
        # entirely) lowercase, so testing the raw token would let
        # capitalised stop words such as "The" slip through.
        if lowered in stop_words:
            continue
        kept.append(stemmer.stem(lowered))
    return " ".join(kept)
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects so cleaning both splits
# reports progress.
tqdm.pandas()
train_texts, test_texts = (
    train_texts.progress_apply(clean),
    test_texts.progress_apply(clean),
)
# Output:
# 100%|████████████████████████████████████| 20761/20761 [02:16<00:00, 152.42it/s]
# 100%|██████████████████████████████████████| 5193/5193 [00:34<00:00, 150.75it/s]
# TF-IDF vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected onto that same vocabulary.
train_vectorizer = TfidfVectorizer(
    max_df=0.6,    # drop terms appearing in more than 60% of documents
    min_df=0.005,  # drop terms appearing in fewer than 0.5% of documents
)
X_train, X_test = (
    train_vectorizer.fit_transform(train_texts),
    train_vectorizer.transform(test_texts),
)
# Display both matrix shapes (rows = documents, columns = vocabulary terms).
(X_train.shape, X_test.shape)
# Output: ((20761, 5551), (5193, 5551))
# Remember the size of each split so the joint embedding can be cut back
# into its train and test parts afterwards.
len_train = X_train.shape[0]
len_test = X_test.shape[0]
# Stack the two sparse matrices first and densify once. Converting each
# matrix separately and concatenating holds the dense data in memory twice;
# `.toarray()` is also the documented replacement for the legacy `.A` alias.
# A dense array is needed because TSNE's PCA initialisation does not accept
# sparse input.
combined = sparse.vstack([X_train, X_test]).toarray()
# Embed train and test together so both splits share one 2-D space.
combined_embedded = TSNE(n_components=2, perplexity=10, init='pca').fit_transform(combined)
train_embedded = combined_embedded[:len_train]
test_embedded = combined_embedded[len_train:]
# Sanity check: the two slices must cover exactly the original rows.
assert train_embedded.shape[0] == len_train and test_embedded.shape[0] == len_test
# Output (warnings):
# /Users/anvil/Documents/Projects/Metaverse_mind_lab_tha/venv/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
#   warnings.warn(
# /Users/anvil/Documents/Projects/Metaverse_mind_lab_tha/venv/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.
#   warnings.warn(
import plotly.graph_objects as go
import plotly.io as pio
# Use plotly's 'notebook' renderer for figure display.
pio.renderers.default = 'notebook'
fig = go.Figure()
# One marker trace per split, plotted on the shared t-SNE axes. Each trace is
# named so the legend identifies the split instead of showing the default
# "trace 0" / "trace 1" labels.
fig.add_trace(go.Scatter(x=train_embedded[:, 0], y=train_embedded[:, 1], mode='markers', name='train'))
# Kept as the final bare expression: add_trace returns the figure, which is
# what a notebook cell displays.
fig.add_trace(go.Scatter(x=test_embedded[:, 0], y=test_embedded[:, 1], mode='markers', name='test'))